import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Load the housing dataset from a CSV file located relative to the working directory.
raw_data = pd.read_csv('Housing_Data.csv')
# Bare expression: as the last statement of a notebook cell it renders a preview of the DataFrame.
raw_data
| | Avg. Area Income | Avg. Area House Age | Avg. Area Number of Rooms | Avg. Area Number of Bedrooms | Area Population | Price | Address |
|---|---|---|---|---|---|---|---|
| 0 | 79545.458574 | 5.682861 | 7.009188 | 4.09 | 23086.800503 | 1.059034e+06 | 208 Michael Ferry Apt. 674\nLaurabury, NE 3701... |
| 1 | 79248.642455 | 6.002900 | 6.730821 | 3.09 | 40173.072174 | 1.505891e+06 | 188 Johnson Views Suite 079\nLake Kathleen, CA... |
| 2 | 61287.067179 | 5.865890 | 8.512727 | 5.13 | 36882.159400 | 1.058988e+06 | 9127 Elizabeth Stravenue\nDanieltown, WI 06482... |
| 3 | 63345.240046 | 7.188236 | 5.586729 | 3.26 | 34310.242831 | 1.260617e+06 | USS Barnett\nFPO AP 44820 |
| 4 | 59982.197226 | 5.040555 | 7.839388 | 4.23 | 26354.109472 | 6.309435e+05 | USNS Raymond\nFPO AE 09386 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 4995 | 60567.944140 | 7.830362 | 6.137356 | 3.46 | 22837.361035 | 1.060194e+06 | USNS Williams\nFPO AP 30153-7653 |
| 4996 | 78491.275435 | 6.999135 | 6.576763 | 4.02 | 25616.115489 | 1.482618e+06 | PSC 9258, Box 8489\nAPO AA 42991-3352 |
| 4997 | 63390.686886 | 7.250591 | 4.805081 | 2.13 | 33266.145490 | 1.030730e+06 | 4215 Tracy Garden Suite 076\nJoshualand, VA 01... |
| 4998 | 68001.331235 | 5.534388 | 7.130144 | 5.44 | 42625.620156 | 1.198657e+06 | USS Wallace\nFPO AE 73316 |
| 4999 | 65510.581804 | 5.992305 | 6.792336 | 4.07 | 46501.283803 | 1.298950e+06 | 37778 George Ridges Apt. 509\nEast Holly, NV 2... |
5000 rows × 7 columns
# Draw a seaborn pair plot of the dataset to eyeball how the columns relate to one another.
sns.pairplot(raw_data)
<seaborn.axisgrid.PairGrid at 0x2a9c0f8f250>
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Feature matrix: the five numeric predictor columns. 'Price' is the target
# and 'Address' is not used as a predictor.
x = raw_data[[
    'Avg. Area Income',
    'Avg. Area House Age',
    'Avg. Area Number of Rooms',
    'Avg. Area Number of Bedrooms',
    'Area Population',
]]
# Target vector: the value the model learns to predict.
y = raw_data['Price']

# Hold out 30% of the rows for evaluation. The seed is fixed so that the split,
# and therefore the fitted coefficients and error metrics, are the same on
# every run instead of changing each time the cell is executed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

# Fit an ordinary least-squares linear regression on the training portion only.
model = LinearRegression()
model.fit(x_train, y_train)
LinearRegression()
# Fitted coefficients, one per feature column of x (same order as x.columns).
print(model.coef_)
[2.16439168e+01 1.63855631e+05 1.19945424e+05 2.20796129e+03 1.52027854e+01]
# Fitted intercept (constant term) of the linear model.
print(model.intercept_)
-2627798.9504352147
# Tabulate the fitted coefficients, labelling each row with its feature name.
pd.DataFrame(data=model.coef_, index=x.columns, columns=['Coeff'])
| | Coeff |
|---|---|
| Avg. Area Income | 21.643917 |
| Avg. Area House Age | 163855.630531 |
| Avg. Area Number of Rooms | 119945.424082 |
| Avg. Area Number of Bedrooms | 2207.961291 |
| Area Population | 15.202785 |
# Predict prices for the held-out test features.
predictions = model.predict(x_test)
# Actual vs. predicted prices: the closer the points lie to the diagonal, the better the fit.
plt.scatter(y_test, predictions)
<matplotlib.collections.PathCollection at 0x2a9c8f42e20>
# Histogram of the residuals (actual minus predicted price) on the test set.
plt.hist(y_test - predictions)
(array([ 4., 26., 104., 222., 282., 355., 271., 158., 65., 13.]),
array([-327857.18771484, -266217.02064656, -204576.85357828,
-142936.68651 , -81296.51944171, -19656.35237343,
41983.81469485, 103623.98176313, 165264.14883141,
226904.31589969, 288544.48296797]),
<BarContainer object of 10 artists>)
from sklearn import metrics
# Mean absolute error of the test-set predictions (same units as Price).
metrics.mean_absolute_error(y_test, predictions)
82246.42443937554
# Mean squared error of the test-set predictions (units are Price squared).
metrics.mean_squared_error(y_test, predictions)
10366906621.779713
# Root mean squared error: square root of the MSE, which brings the error back to Price units.
np.sqrt(metrics.mean_squared_error(y_test, predictions))
101818.00735518111